#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

###########################
### Autor: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.8a  - 22-10-2015@23:53 Uhr
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python: Main File
###########################

#https://docs.python.org/2/library/configparser.html
import os
import sys
# Python 2 only: reloading sys makes sys.setdefaultencoding() reachable again
# (the function is removed from the module during interpreter start-up).
reload(sys)
# Extend the import path before the project import further down.
# NOTE(review): presumably this is where the `onetipp` module lives - verify.
sys.path.append('/home/onetipp/python/modules')
# PYTHON_EGG_CACHE is the directory setuptools/pkg_resources uses to unpack
# zipped eggs; it is set before the third-party imports below run.
os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'
# Make UTF-8 the default codec for implicit str <-> unicode conversions.
sys.setdefaultencoding('utf-8')

# System dependency note from the original author: apt-get install python-mysqldb
from sphinxit.core.processor import Search                  # http://sphinxit.readthedocs.org/en/latest/
import codecs
import re
from transliterate import translit, get_available_language_codes
import onetipp

# Synonyms that were already used as a replacement; each one is used at most
# once per run.
noDoubleHash = set()

# Matches a token that ends in sentence-final punctuation ("?", "." or "!").
re_match = r"(\?|\.|\!)$"

# Read the input and output file paths from the command line.
if len(sys.argv) < 3:
    # Fail with a readable message instead of an IndexError traceback.
    sys.stderr.write("Usage: %s <inputfile> <outputfile>\n" % sys.argv[0])
    sys.exit(1)

inputfile   = sys.argv[1]
outputfile  = sys.argv[2]

# Read the whole input file; the context manager closes the handle even if
# reading fails.  The bytes are then decoded to unicode exactly once here.
with open(inputfile, 'r') as inputHandle:
    text = inputHandle.read()
text = text.decode("utf-8")

# Summarise the text first; the loop further down iterates over the tokens of
# that summary.  The unsummarised text is tokenised too and kept in tokensRaw.
tSumy = onetipp.summarizeText(text)

_tokenize = onetipp.nltk.word_tokenize
tokens = _tokenize(tSumy)
tokensRaw = _tokenize(text)

# State shared across iterations of the token loop below.

# Index of the current token (-1 means no token has been processed yet).
count = -1

# Length of the window that is compared against changeEveryWordTemp.
# Original author's note, translated: "Leistungsschutzrecht (German ancillary
# copyright): 7 characters may be used, beyond that the text must be changed."
changeEveryWord = 6

# changeEveryWordFlag: 1 after a synonym was inserted, 0 otherwise.
# changeEveryWordTemp: temporary up-counter compared against changeEveryWord.
# ignoreNextWord:      1 when the following token must be skipped.
changeEveryWordFlag = changeEveryWordTemp = ignoreNextWord = 0

def _isKnownName(word):
    """Return True if *word* is found in the names table.

    The word is reduced to ASCII before the lookup (non-ASCII characters are
    dropped).  If the query fails, a message is printed and the word is
    treated as "not a name" so that processing can continue.
    """
    wordAscii = word.encode('ascii', 'ignore')
    try:
        # Parameterised query: the token comes straight from the input text
        # and must not be interpolated into the SQL string (quotes in the
        # token would break or alter the statement).
        onetipp.cursorMysql.execute(
            "SELECT * FROM (namen_table) WHERE BINARY `name` = %s LIMIT 1;",
            (wordAscii,))
        return onetipp.cursorMysql.fetchone() is not None
    except Exception:
        # Best effort: keep going without name detection, but do not swallow
        # KeyboardInterrupt/SystemExit the way a bare "except:" would.
        print("Der Namensparser die Datenbank konnte nicht angesprochen werden")
        return False


def _lookupLocalSynonyms(word):
    """Return the synonym candidates for *word* from the local synonym DB.

    The Sphinx index 'onetipp_syn_simple' is asked for the best matching row
    id; the ';'-separated ``synonyms`` column of that row is then fetched via
    MySQL.  An empty list is returned when nothing usable is found.
    """
    search_query_syn = Search(indexes=['onetipp_syn_simple'], config=onetipp.SphinxitConfig)
    search_query_syn = search_query_syn.match(word).options(
        ranker='proximity_bm25',
        max_matches=1,
        max_query_time=350,
        field_weights={'synonyms': 100},
    )
    sphinx_result_syn = search_query_syn.ask()

    try:
        # NOTE(review): takes the first value of the first result item and
        # uses it as the row id - confirm that this is always the uid field.
        synID = sphinx_result_syn['result']['items'][0].values()[0]
    except IndexError:
        # No match in the Sphinx index.
        return []

    if not synID > 0:
        return []

    onetipp.cursorMysql.execute(
        "SELECT synonyms FROM (synonym_unique_simple) WHERE uid= %s", (synID,))
    syn_content = onetipp.cursorMysql.fetchone()
    # Check the row *before* touching it: fetchone() yields None when the id
    # has no matching row.
    if not syn_content:
        return []

    # list() is kept from the original so any iterable row type behaves as before.
    rawSynonyms = list(syn_content)[0]
    if rawSynonyms is None:
        return []
    return rawSynonyms.decode(encoding="utf-8", errors="ignore").split(";")


def _pickBestSynonym(candidates, word):
    """Return ``(synonym, rank)`` for the best unused candidate, or None.

    Candidates already contained in ``noDoubleHash`` are ignored.  Both tuple
    members are returned as ``str``.  None is returned when no unused
    candidate is left.
    """
    ranked = {}
    for candidate in candidates:
        if candidate not in noDoubleHash:
            ranked[candidate] = onetipp.SynRanker(candidate, word)

    if not ranked:
        return None

    # Highest rank wins; on ties the first entry in iteration order is taken,
    # which matches a stable descending sort followed by [0].
    best, rank = max(ranked.items(), key=lambda item: item[1])
    return str(best), str(rank)


def _replaceWithSynonym(index, word, synonym, rank, sourceLabel):
    """Replace token *index* with *synonym* in ``tokens`` and ``tokensRaw``.

    ``tokens`` receives HTML markup naming the rank and *sourceLabel*;
    ``tokensRaw`` receives the plain replacement.  The synonym is recorded in
    ``noDoubleHash`` so it is not used twice.
    """
    noDoubleHash.add(synonym)

    # Capitalise the replacement at the start of a sentence: either it is the
    # very first token, or the previous token ends in sentence punctuation.
    # Only the first character is upper-cased; str.title() would also change
    # the casing inside the word / of following words.
    if index == 0 or re.search(re_match, tokens[index - 1]) is not None:
        synonym = synonym[:1].upper() + synonym[1:]

    synonym = onetipp.putPunctuation(word, synonym)
    # NOTE(review): deumlaut() is assumed to be a pure string transformation,
    # so it is called once and the result is used for both outputs.
    plain = onetipp.deumlaut(synonym)

    tokens[index] = ('<b style="color:#FF99FF; text-decoration: underline" title="SynRank('
                     + rank + ') /' + sourceLabel + '/ ->Synonym ausgetauscht!"><i>'
                     + plain + '</i></b>')
    _setRawToken(index, plain)


def _setRawToken(index, value):
    """Store *value* at *index* in ``tokensRaw`` if that position exists.

    NOTE(review): ``tokens`` and ``tokensRaw`` are built from different
    inputs, so the same index does not necessarily refer to the same word and
    ``tokensRaw`` is not guaranteed to have the same length - confirm the
    intended alignment.
    """
    if index < len(tokensRaw):
        tokensRaw[index] = value


# Walk over all tokens: recognised names are highlighted and never replaced,
# other words of at least four characters are replaced by their best-ranked
# synonym, throttled by the changeEveryWord* counters.
for count, word in enumerate(tokens):

    # The token directly after a recognised name is left untouched, because
    # names often consist of first name + surname.
    if ignoreNextWord == 1:
        ignoreNextWord = 0
        changeEveryWordFlag = 0
        continue

    # A name was found -> highlight it and do not exchange it for a synonym.
    if _isKnownName(word):
        plainName = onetipp.deumlaut(word)
        tokens[count] = '<b style="color:#00FFFF;" title="Namen erkannt"><i>' + plainName + '</i></b>'
        _setRawToken(count, plainName)
        ignoreNextWord = 1

    # Window bookkeeping: once the temporary counter reaches the window
    # length, all throttling state is reset.
    # NOTE(review): this also clears ignoreNextWord that may have been set
    # for a name just above, so that name can still be replaced below -
    # confirm whether that is intended.
    if changeEveryWordTemp == (changeEveryWord - 1):
        changeEveryWordFlag     = 0
        changeEveryWordTemp     = 0
        ignoreNextWord          = 0

    if changeEveryWordFlag == 1:
        changeEveryWordTemp += 1

    # Only words of at least four characters are replaced, and only while no
    # throttling or name flag is active.
    if len(word) < 4 or changeEveryWordFlag != 0 or ignoreNextWord != 0:
        continue

    print("IgnoreNextWord: und aktuelles wort: ", ignoreNextWord, word)

    # Try the Leipzig DB first; the local Sphinx/MySQL synonym DB is only
    # consulted when Leipzig returns nothing at all.
    leipzigCandidates = onetipp.getSynLeipzig(word)
    if leipzigCandidates:
        sourceLabel = 'Leipzig DB'
        bestHit = _pickBestSynonym(leipzigCandidates, word)
    else:
        sourceLabel = 'LocalMysqL DB'
        bestHit = _pickBestSynonym(_lookupLocalSynonyms(word), word)

    # No unused synonym available -> keep the original word.
    if bestHit is None:
        continue

    _replaceWithSynonym(count, word, bestHit[0], bestHit[1], sourceLabel)
    changeEveryWordFlag = 1
    changeEveryWordTemp += 1

# Write the results: the HTML-annotated text goes to <outputfile>, the plain
# text to <outputfile>.raw.txt.
outputtext          = ' '.join(tokens)
outputtextRaw       = ' '.join(tokensRaw)

# Flesch reading-ease score of the plain text.  It is computed here but not
# written to either output file.
readabilityVar      = str(onetipp.textstat.flesch_reading_ease(outputtextRaw))

# The encoding is stated explicitly so that writing unicode does not depend
# on the interpreter-wide default encoding.  The with-blocks close the files,
# so no explicit close() is needed.
with codecs.open(outputfile, 'w', encoding='utf-8') as f:
    f.write(outputtext)

with codecs.open(outputfile + ".raw.txt", 'w', encoding='utf-8') as f:
    f.write(outputtextRaw)

onetipp.mysql.commit()
onetipp.mysql.close()
# sys.exit() instead of the site-provided exit() helper, which is not
# guaranteed to exist (e.g. when Python runs with -S).
sys.exit(0)


"""
The Flesch Reading Ease formula

function name - flesch_reading_ease(text)

returns the Flesch Reading Ease Score. The following table is helpful to assess the ease of readability of a document.

90-100 : Very Easy
80-89 : Easy
70-79 : Fairly Easy
60-69 : Standard
50-59 : Fairly Difficult
30-49 : Difficult
0-29 : Very Confusing

"""